library(data.table)
library(tidyr)
library(ggplot2)
# Read the Wave 5 data from its RDS file
WV5_data <- readRDS(
  file = "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/F00007944-WV5_Data_R_v20180912.rds"
)
# Coerce the loaded object to a data.frame
WV5_data_df <- as.data.frame(x = WV5_data)
# Preview the first rows (head() default) of the first five columns
head(WV5_data_df[, seq_len(5)])
library(dplyr)
# Give the Wave 5 survey items descriptive names
WV5_data <- rename(
  WV5_data_df,
  risk_and_adventure = V86,
  sex = V235,
  age = V237,
  country = V2,
  wave = V1
)
WV5_data
# Keep only the variables of interest
WV5_data <- select(WV5_data, risk_and_adventure, sex, age, country, wave)
WV5_data
# Decode the numeric country codes into country names.
# countrynames.txt is read without a header; its two columns are named
# "code" and "name" so they can be matched against WV5_data$country.
countrynames <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
colnames(countrynames) <- c("code", "name")
WV5_data$country_lab <- countrynames$name[match(WV5_data$country, countrynames$code)]
# Participants per country, counted before any exclusion
table(WV5_data$country_lab)
# Exclude participants with no info about risk, sex, or age (only rows with
# positive values on all three are kept).
# The result is stored back into WV5_data: it was previously assigned to
# WV5_data_df, which nothing reads afterwards, so Wave 5 was never actually
# filtered before being combined with Wave 6. Row counts taken after this
# point therefore differ from counts produced by the unfiltered version.
WV5_data <- subset(WV5_data, risk_and_adventure > 0 & sex > 0 & age > 0)
Andorra Argentina Australia Brazil Bulgaria
1003 1002 1421 1500 1001
Burkina Faso Canada Chile China Colombia
1534 2164 1000 1991 3025
Cyprus (G) Egypt Ethiopia Finland France
1050 3051 1500 1014 1001
Georgia Germany Ghana Great Britain Guatemala
1500 2064 1534 1041 1000
Hong Kong Hungary India Indonesia Iran
1252 1007 2001 2015 2667
Iraq Italy Japan Jordan Malaysia
2701 1012 1096 1200 1201
Mali Mexico Moldova Morocco Netherlands
1534 1560 1046 1200 1050
New Zealand Norway Peru Poland Romania
954 1025 1500 1000 1776
Russia Rwanda Slovenia South Africa South Korea
2033 1507 1037 2988 1200
Spain Sweden Switzerland Taiwan Thailand
1200 1003 1241 1227 1534
Trinidad and Tobago Turkey Ukraine United States Uruguay
1002 1346 1000 1249 1000
Viet Nam Zambia
1495 1500
# Display the Wave 5 data, now including the country_lab column
WV5_data
NA
NA
# Read dataset (Wave 6)
# load() restores the saved objects into the workspace and returns only their
# names (invisibly), so its return value is not assigned. The data itself is
# the restored object WV6_Data_R_v20201117.
load("/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/WV6_Data_R_v20201117.rdata")
WV6_data <- WV6_Data_R_v20201117
print(WV6_data)
# rename variables (Wave 6)
# Give the Wave 6 survey items descriptive names
WV6_data <- rename(
  WV6_data,
  wave = V1,
  risk_and_adventure = V76,
  sex = V240,
  age = V242,
  education = V237,
  country = V2
)
# Keep only the variables of interest
# (education is renamed above but is not among the retained columns)
WV6_data <- select(WV6_data, risk_and_adventure, sex, age, country, wave)
WV6_data
NA
# Decode dataset (Wave 6): map numeric country codes to country names
countrynames <- read.csv(
  file = "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
names(countrynames) <- c("code", "name")
# match() gives, for every respondent, the row of the lookup table whose code
# equals that respondent's country code (NA when the code is not listed)
WV6_data$country_lab <- countrynames[["name"]][
  match(WV6_data[["country"]], countrynames[["code"]])
]
# Participants per country
table(WV6_data[["country_lab"]])
Algeria Argentina Armenia Australia Azerbaijan
1200 1030 1100 1477 1002
Belarus Brazil Chile China Colombia
1535 1486 1000 2300 1512
Cyprus (G) Ecuador Egypt Estonia Georgia
1000 1202 1523 1533 1202
Germany Ghana Haiti Hong Kong India
2046 1552 1996 1000 4078
Iraq Japan Jordan Kazakhstan Kuwait
1200 2443 1200 1500 1303
Kyrgyzstan Lebanon Libya Malaysia Mexico
1500 1200 2131 1300 2000
Morocco Netherlands New Zealand Nigeria Pakistan
1200 1902 841 1759 1200
Palestine Peru Philippines Poland Qatar
1000 1210 1200 966 1060
Romania Russia Rwanda Singapore Slovenia
1503 2500 1527 1972 1069
South Africa South Korea Spain Sweden Taiwan
3531 1200 1189 1206 1238
Thailand Trinidad and Tobago Tunisia Turkey Ukraine
1200 999 1205 1605 1500
United States Uruguay Uzbekistan Yemen Zimbabwe
2232 1000 1500 1000 1500
# Display the Wave 6 data
WV6_data
# Exclude participants with no info about risk, sex, or age: only rows with
# positive values on all three variables are kept
WV6_data <- subset(
  WV6_data,
  risk_and_adventure > 0 & sex > 0 & age > 0
)
# Combine the two data sets: Wave 5 rows first, followed by Wave 6 rows
data <- rbind(WV5_data, WV6_data)
data
# Number of distinct country labels in the combined data
length(unique(data[["country_lab"]]))
[1] 80
# Number of participants: row count of the combined data at this point,
# i.e. before the exclusion filter that is applied to `data` further down
nrow(data)
[1] 170195
# Exclusion of participants: keep only rows of the combined data with
# positive values on risk, sex, and age
data <- subset(
  data,
  risk_and_adventure > 0 & sex > 0 & age > 0
)
data
NA
# Number of males vs. females (1 = males; 2 = females)
table(data[["sex"]])
1 2
75737 81963
# Create a categorical age variable.
# cut() with right = FALSE builds left-closed intervals [lower, upper), which
# gives the bands: below 20, 20-29, 30-39, ..., 70-79, and 80 or older.
# Every age below 20 receives the label "15-19". as.character() keeps agecat
# a character column rather than a factor.
data$agecat <- as.character(cut(
  data$age,
  breaks = c(-Inf, 20, 30, 40, 50, 60, 70, 80, Inf),
  labels = c(
    "15-19", "20-29", "30-39", "40-49",
    "50-59", "60-69", "70-79", "80+"
  ),
  right = FALSE
))
# Gender variable: replace the codes with labels (1 -> "male", 2 -> "female").
# Assuming sex is stored as a plain numeric column, the first assignment turns
# it into character, and the second comparison then matches the code 2 in its
# character form.
data[["sex"]][data[["sex"]] == 1] <- "male"
data[["sex"]][data[["sex"]] == 2] <- "female"
# Average age of participants
mean(data[["age"]])
# Age range (minimum and maximum)
range(data[["age"]])
# Risk-taking frequency: histogram of the risk_and_adventure scores
ggplot(data, aes(x = risk_and_adventure)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "black") +
  labs(
    x = "Risk Taking",
    y = "Frequency",
    title = "Histogram of Risk Taking"
  ) +
  theme_minimal()

# Age frequency: histogram of participant age
# (plot title corrected from "Distributionn" to "Distribution")
ggplot(data, aes(x = age)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "black") +
  labs(
    x = "Age",
    y = "Frequency",
    title = "Histogram of Age Distribution"
  ) +
  theme_minimal()

# Age vs. risk taking: one box per age category (agecat)
ggplot(data, aes(x = agecat, y = risk_and_adventure)) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Risk and Adventure by Age",
    x = "Age",
    y = "Risk and Adventure"
  ) +
  theme_minimal()
NA
NA
# Sex vs. risk taking: one box per sex category
ggplot(data, aes(x = as.factor(sex), y = risk_and_adventure)) +
  geom_boxplot()
# Display the final combined data
data